#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(Rtsne)
library(ClusterR)
library(DESeq2)
library(expss)
library(knitr)
Load preprocessed dataset (preprocessing code in 20_03_30_data_preprocessing.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update DE_info with SFARI and Neuronal information
DE_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), significant=padj<0.05 & !is.na(padj))
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
cat(paste0('There are ', length(unique(SFARI_genes$`gene-symbol`)), ' genes with a SFARI score'))
## There are 979 genes with a SFARI score
The results from this section don’t change depending on the brain region analised, so I’m not going to repeat them here. These can be found in the folder where all the brain regions were analysed together
As in the previous section, the results from this section don’t change depending on the brain region analised, so I’m not going to repeat them here. These can be found in the folder where all the brain regions were analysed together
The higher the SFARI score, the higher the mean expression of the gene: This pattern is quite strong and it doesn’t have any biological interpretation, so it’s probably bias in the SFARI score assignment
The higher the SFARI score, the higher the standard deviation: This pattern is not as strong, but it is weird because the data was originally heteroscedastic with a positive relation between mean and variance, but this was supposed to have been corrected with the vst transformation
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(DE_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
Just to corroborate that the relation between sd and SFARI score used to be in the opposite direction before the normalisation: The higher the SFARI score the higher the mean expression and the higher the standard deviation
*There are a lot of outliers, but the plot is interactive so you can zoom in
# Save preprocessed results
datExpr_prep = datExpr
datMeta_prep = datMeta
DE_info_prep = DE_info
load('./../Data/filtered_raw_data.RData')
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>%
left_join(DE_info, by='ID')
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ggtitle('Mean Expression (left) and SD (right) by SFARI score') +
theme(legend.position='none'))
subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)
Return to normalised version of the data
# Save preprocessed results
datExpr = datExpr_prep
datMeta = datMeta_prep
DE_info = DE_info_prep
rm(datExpr_prep, datMeta_prep, DE_info_prep)
There seems to be a negative relation between SFARI score and log fold change when it would be expected to be either positively correlated or independent from each other (this last one because there are other factors that determine if a gene is releated to Autism apart from differences in gene expression)
Wikipedia mentions the likely explanation for this: “A disadvantage and serious risk of using fold change in this setting is that it is biased and may misclassify differentially expressed genes with large differences (B − A) but small ratios (B/A), leading to poor identification of changes at high expression levels”.
Based on this, since we saw there is a strong relation between SFARI score and mean expression, the bias in log fold change affects mainly genes with high SFARI scores, which would be the ones we are most interested in.
This pattern is stronger in the Gandal dataset
ggplotly(DE_info %>% ggplot(aes(x=gene.score, y=abs(log2FoldChange), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
theme_minimal() + theme(legend.position='none'))
The higher the percentage of genes that get filtered by differential expression. This pattern is not as clear as with Gandal’s dataset
This pattern is generally consistent on all log fold change thresholds
In general, SFARI scores are more affected by the filtering than the genes with Neuronal-related functional annotations
SFARI gene group 1 now has the highest percentage of remaining genes (opposite to Gandal’s), although the number of genes with SFARI score 1 is small, so this could be unreliable
If we stick to the original null hypothesis \(H_0: lfc=0\), only 91/828 SFARI genes are statistically significant (~11%)
lfc_list = seq(1, 1.15, 0.005)
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_info)))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_info$Neuronal)))
lfc_counts_all = DE_info %>% group_by(`gene-score`) %>% tally %>%
mutate('group'=as.factor(`gene-score`), 'n'=as.character(n)) %>%
dplyr::select(group, n) %>%
bind_rows(Neuronal_counts, all_counts) %>%
mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)
for(lfc in lfc_list){
# Recalculate DE_info with the new threshold (p-values change)
DE_genes = results(dds, lfcThreshold=log2(lfc), altHypothesis='greaterAbs') %>% data.frame
DE_genes = DE_genes %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`))
DE_genes = DE_genes %>% filter(padj<0.05 & abs(log2FoldChange)>log2(lfc))
# Calculate counts by groups
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Neuronal)))
lfc_counts = DE_genes %>% group_by(`gene-score`) %>% tally %>%
mutate('group'=`gene-score`, 'n'=as.character(n)) %>%
bind_rows(Neuronal_counts, all_counts) %>%
mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
# Update lfc_counts_all
lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}
# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>%
left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)
# Calculate percentage of each group remaining
tot_counts = DE_info %>% group_by(`gene-score`) %>% tally() %>% filter(`gene-score`!='None') %>%
mutate('group'=`gene-score`, 'tot'=n) %>% dplyr::select(group, tot) %>%
bind_rows(data.frame('group'='Neuronal', 'tot'=sum(DE_info$Neuronal)),
data.frame('group'='All', 'tot'=nrow(DE_info)))
lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1, group!='None') %>%
left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))
# Plot change of number of genes
ggplotly(lfc_counts_all %>% ggplot(aes(lfc, perc, color=group)) + geom_point(aes(id=n)) + geom_line() +
scale_color_manual(values=SFARI_colour_hue(r=1:8)) + ylab('% of remaining genes') + xlab('Fold Change') +
ggtitle('Effect of filtering thresholds by SFARI score') + theme_minimal())
rm(lfc_list, all_counts, Neuronal_counts, lfc_counts_all, lfc, lfc_counts, lfc_counts_all, tot_counts, lfc_counts_all)
cat(paste0('There are ', sum(DE_info$padj<0.05 & DE_info$`gene-score` != 'None' & !is.na(DE_info$padj)),
' SFARI genes that are differentially expressed'))
## There are 87 SFARI genes that are differentially expressed
kable(DE_info %>% filter(padj<0.05 & `gene-score` %in% c(1,2,3) & !is.na(padj)) %>%
dplyr::select(ID, `gene-symbol`, log2FoldChange, padj, `gene-score`, Neuronal) %>% arrange(`gene-score`,padj),
caption = 'Top SFARI scores that are DE')
| ID | gene-symbol | log2FoldChange | padj | gene-score | Neuronal |
|---|---|---|---|---|---|
| ENSG00000251322 | SHANK3 | -0.4342702 | 0.0030963 | 1 | 1 |
| ENSG00000118058 | KMT2A | 0.4183848 | 0.0040453 | 1 | 0 |
| ENSG00000197283 | SYNGAP1 | -0.3340702 | 0.0249260 | 1 | 1 |
| ENSG00000168137 | SETD5 | 0.2987199 | 0.0252926 | 1 | 0 |
| ENSG00000157103 | SLC6A1 | -0.7070327 | 0.0001111 | 2 | 1 |
| ENSG00000174469 | CNTNAP2 | -0.2985477 | 0.0110806 | 2 | 1 |
| ENSG00000038382 | TRIO | -0.2589181 | 0.0349991 | 2 | 0 |
| ENSG00000177030 | DEAF1 | -0.2747991 | 0.0427330 | 2 | 0 |
| ENSG00000141027 | NCOR1 | 0.3393049 | 0.0452036 | 2 | 0 |
| ENSG00000079432 | CIC | -0.2131121 | 0.0469210 | 2 | 0 |
| ENSG00000215301 | DDX3X | -0.2608556 | 0.0486155 | 2 | 0 |
| ENSG00000136854 | STXBP1 | -0.3710927 | 0.0000790 | 3 | 1 |
| ENSG00000181722 | ZBTB20 | 0.8772604 | 0.0011067 | 3 | 0 |
| ENSG00000196628 | TCF4 | 0.4099807 | 0.0036609 | 3 | 1 |
| ENSG00000135439 | AGAP2 | -0.5040011 | 0.0040737 | 3 | 1 |
| ENSG00000157087 | ATP2B2 | -0.4749078 | 0.0057662 | 3 | 1 |
| ENSG00000124140 | SLC12A5 | -0.3447022 | 0.0059061 | 3 | 1 |
| ENSG00000196361 | ELAVL3 | 0.3181172 | 0.0158828 | 3 | 0 |
| ENSG00000146247 | PHIP | 0.4637417 | 0.0202719 | 3 | 0 |
| ENSG00000161681 | SHANK1 | -0.4667923 | 0.0282489 | 3 | 1 |
| ENSG00000114062 | UBE3A | 0.3068265 | 0.0356651 | 3 | 0 |
| ENSG00000074054 | CLASP1 | -0.3196193 | 0.0356693 | 3 | 0 |
| ENSG00000100241 | SBF1 | -0.2378509 | 0.0405028 | 3 | 0 |
| ENSG00000108557 | RAI1 | -0.3303705 | 0.0476327 | 3 | 0 |
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.28 expss_0.10.2
## [3] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [5] DelayedArray_0.10.0 BiocParallel_1.18.1
## [7] matrixStats_0.56.0 Biobase_2.44.0
## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [11] IRanges_2.18.3 S4Vectors_0.22.1
## [13] BiocGenerics_0.30.0 ClusterR_1.2.1
## [15] gtools_3.8.1 Rtsne_0.15
## [17] GGally_1.5.0 gridExtra_2.3
## [19] viridis_0.5.1 viridisLite_0.3.0
## [21] RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [23] plotly_4.9.2 glue_1.3.2
## [25] reshape2_1.4.3 forcats_0.5.0
## [27] stringr_1.4.0 dplyr_0.8.5
## [29] purrr_0.3.3 readr_1.3.1
## [31] tidyr_1.0.2 tibble_3.0.0
## [33] ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ellipsis_0.3.0 htmlTable_1.13.3
## [4] XVector_0.24.0 base64enc_0.1-3 fs_1.3.2
## [7] rstudioapi_0.11 bit64_0.9-7 AnnotationDbi_1.46.1
## [10] fansi_0.4.1 lubridate_1.7.4 xml2_1.2.5
## [13] splines_3.6.3 geneplotter_1.62.0 Formula_1.2-3
## [16] jsonlite_1.6.1 annotate_1.62.0 broom_0.5.5
## [19] cluster_2.1.0 dbplyr_1.4.2 png_0.1-7
## [22] compiler_3.6.3 httr_1.4.1 backports_1.1.5
## [25] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [28] cli_2.0.2 acepack_1.4.1 htmltools_0.4.0
## [31] tools_3.6.3 gmp_0.5-13.6 gtable_0.3.0
## [34] GenomeInfoDbData_1.2.1 Rcpp_1.0.4 cellranger_1.1.0
## [37] vctrs_0.2.4 nlme_3.1-144 crosstalk_1.1.0.1
## [40] xfun_0.12 rvest_0.3.5 lifecycle_0.2.0
## [43] XML_3.99-0.3 zlibbioc_1.30.0 scales_1.1.0
## [46] hms_0.5.3 yaml_2.2.1 memoise_1.1.0
## [49] rpart_4.1-15 RSQLite_2.2.0 reshape_0.8.8
## [52] latticeExtra_0.6-29 stringi_1.4.6 highr_0.8
## [55] genefilter_1.66.0 checkmate_2.0.0 rlang_0.4.5
## [58] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
## [61] lattice_0.20-40 labeling_0.3 htmlwidgets_1.5.1
## [64] bit_1.1-15.2 tidyselect_1.0.0 plyr_1.8.6
## [67] magrittr_1.5 R6_2.4.1 generics_0.0.2
## [70] Hmisc_4.4-0 DBI_1.1.0 pillar_1.4.3
## [73] haven_2.2.0 foreign_0.8-75 withr_2.1.2
## [76] survival_3.1-11 RCurl_1.98-1.1 nnet_7.3-13
## [79] modelr_0.1.6 crayon_1.3.4 rmarkdown_2.1
## [82] jpeg_0.1-8.1 locfit_1.5-9.4 grid_3.6.3
## [85] readxl_1.3.1 data.table_1.12.8 blob_1.2.1
## [88] reprex_0.3.0 digest_0.6.25 xtable_1.8-4
## [91] munsell_0.5.0